Entorhinal Cortex R Markdown

R version-4.1.0 import datasets file> Import Dataset > From Excel > Import

finalmatrix.xls: count matrix x= erv-loci locations, y= Sample names TREM2MasterAnnotation.xlsx = load meta data

#load install packages /libraries
library(dplyr)
library(tidyverse)
library(readxl)
library(DESeq2)
library(tibble)
library(plotly)
library(ggplot2)
library(EnhancedVolcano)
library(sva)
library(DT)
#EC
#load count data + ERV loci data frame

cts <- read_excel("~/Genomic Medicine/Project/msc_project/finalmatrix.xls") 
#View(cts)

colnames(cts) <-sub("\\.", "-", colnames(cts))

cts <- as.data.frame(cts)
rownames(cts) <-cts$`erv-id`

#remove all row and col with NA
cts <- cts[(1:789),]

#remove MCI "RDobson-38"
#remove HC "RDobson-132"
#remove gyrus"RDobson-129", "RDobson-130", "RDobson-131", "RDobson-133"
#remove failed sequencing  'RDobson-18'
#remaining 62 in total

# outliers , low quality  'RDobson-67', "RDobson-101"

ctsEC <-cts %>% select('erv-id',"RDobson-58", "RDobson-119", "RDobson-40",  "RDobson-113", "RDobson-76", "RDobson-67", "RDobson-101", "RDobson-10", "RDobson-111", "RDobson-120", "RDobson-125", "RDobson-91", "RDobson-4", "RDobson-31", "RDobson-45", "RDobson-63", "RDobson-29", "RDobson-33", "RDobson-109", "RDobson-80","RDobson-54", "RDobson-48", "RDobson-78", "RDobson-88", "RDobson-126", "RDobson-28", "RDobson-30", "RDobson-93", "RDobson-2", "RDobson-7", "RDobson-118", "RDobson-5", "RDobson-56", "RDobson-21", "RDobson-53", "RDobson-104", "RDobson-27", "RDobson-1", "RDobson-74", "RDobson-85", "RDobson-92", "RDobson-9", "RDobson-6", "RDobson-84", "RDobson-62", "RDobson-14", "RDobson-19", "RDobson-122", "RDobson-69", "RDobson-44", "RDobson-8", "RDobson-128", "RDobson-49", "RDobson-100", "RDobson-106", "RDobson-46", "RDobson-70", "RDobson-115", "RDobson-15", "RDobson-82", "RDobson-47")
#load metadata and select data fields of interest, rename field to contain no spaces

meta <- read_xlsx("~/Genomic Medicine/Project/TREM2MasterAnnotation.xlsx")
## New names:
## * `Subject id` -> `Subject id...1`
## * `Sample ID` -> `Sample ID...3`
## * `Subject id` -> `Subject id...16`
## * `RNA tube labels` -> `RNA tube labels...66`
## * `RNA tube labels` -> `RNA tube labels...67`
## * ...
#View(TREM2MasterAnnotation)

metaData <- select(meta, `Sample ID...73`, Diagnosis_1, Tissue , `Sex`, `Age (at death)`, `PostMortemDelay (hours)`, `RIN Score`, `No. of E4 alleles`, `Sequencing Pool`) 

metaEC<-metaData[metaData$Tissue == 'Entorhinal cortex',]

names(metaEC)[1] <-"Sample"
names(metaEC)[5] <-"Age_at_death"
names(metaEC)[6] <-"PostMortem_Delay_hours"
names(metaEC)[7] <-"RIN_Score"
names(metaEC)[8] <-"Num_of_E4_alleles"
names(metaEC)[9] <-"Seq_pool"

#specify bin values
metaEC$Num_of_E4_alleles <- as.factor(metaEC$Num_of_E4_alleles)

metaEC[is.na(metaEC)] <- 0


#place RIN value in quintiles bins 
#metaEC = mutate(metaEC, quantile_rank = ntile(metaEC$RIN_Score,5))
#metaEC$quantile_rank<- as.factor(metaEC$quantile_rank)

#place PostMortem_Delay_hours in quintile bins
#metaEC = mutate(metaEC, quantile_rank = ntile(metaEC$PostMortem_Delay_hours,5))
#metaEC$quantile_rank <- as.factor(metaEC$quantile_rank)

#place Age_at_death in quintile bins
#metaEC = mutate(metaEC, quantile_rank = ntile(metaEC$Age_at_death,5))
#metaEC$quantile_rank <- as.factor(metaEC$quantile_rank)


#only EC 
metaEC <- as.data.frame(metaEC)

# need to remove MCI RDobson-38
#rm RDobson-18 
metaEC <- metaEC[-c(6,9),]
#7,8
#run deseq2
dds <- DESeqDataSetFromMatrix(countData = ctsEC ,
                       colData = metaEC ,
                       design =~Sex + Age_at_death + RIN_Score + PostMortem_Delay_hours + Num_of_E4_alleles + Seq_pool + Diagnosis_1 ,
                       tidy = TRUE)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#inital PCA to assess variance of data
 rld <- varianceStabilizingTransformation(dds)
  rld_pca <- plotPCA(rld, intgroup=c("Diagnosis_1" )) + 
    labs(title = "EC-PCA ") +
      aes(label = colnames(rld))
  rld_pca_plotly <- ggplotly(rld_pca)
  rld_pca_plotly
#correct for known batch effects using SVA and ComBat
# 'RDobson-67', "RDobson-101" replaced for SVA analysis
batch <- metaEC$Seq_pool
metaEC$Seq_pool <- replace(metaEC$Seq_pool, metaEC$Seq_pool == "Pool 1", '1')
metaEC$Seq_pool <- replace(metaEC$Seq_pool, metaEC$Seq_pool == "Pool 2", '2')
metaEC$Seq_pool <- replace(metaEC$Seq_pool, metaEC$Seq_pool == "Pool 3", '3')
metaEC$Seq_pool <- replace(metaEC$Seq_pool, metaEC$Seq_pool == "Pool 4", '4')
metaEC$Seq_pool <- replace(metaEC$Seq_pool, metaEC$Seq_pool == "Pool 5", '5')
metaEC$Seq_pool <- replace(metaEC$Seq_pool, metaEC$Seq_pool == "Pool 6", '6')
metaEC$Seq_pool <- as.numeric(metaEC$Seq_pool)
dat <- as.matrix(ctsEC[2:ncol(ctsEC)])

modcombat = model.matrix(~1, data =(as.data.frame(metaEC$Diagnosis_1))) 

data_adjusted_EC <- ComBat_seq(dat, batch=batch, group = modcombat)
## Found 6 batches
## Using null model in ComBat-seq.
## Adjusting for 0 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data
#plot PCA
pca_adjusted <-prcomp(data_adjusted_EC)
pca_adjustedEC<- summary(pca_adjusted)

#select pca 1 and pca 2, make matrix
PCA1 <- pca_adjustedEC$rotation[,1:2]

#add phenotype info
PCA1 <-cbind(PCA1,metaEC[c(1,2,4,5,6,7,8,9)])

#head(pca_adjustedEC)

#plot
ggplotly(ggplot(data = PCA1, 
                aes(x=PC1, y=PC2, col= Diagnosis_1))+
            xlab("PC1: 0.85% variance") +
           ylab("PC2: 0.04% variance") +
           geom_point()+
           ggtitle("EC- PCA after SVA"))
data_adjusted_EC <- as.matrix(data_adjusted_EC)

dds <- DESeqDataSetFromMatrix(countData = data_adjusted_EC ,
                       colData = metaEC ,
                       design =~Sex + Age_at_death + RIN_Score + PostMortem_Delay_hours + Num_of_E4_alleles + Seq_pool + Diagnosis_1 )
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#analyse
ddsEC <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#get results
res <- results(ddsEC, contrast = c("Diagnosis_1","Control", "AD"))
#see volcano plot of 0.05 as p-value
res$ervid <- row.names(res)

EnhancedVolcano(res, 
                lab= rownames(res), 
                x ='log2FoldChange', y = 'padj',
                xlab = bquote(~Log[2]~ "fold change"),
                ylab = ~-Log[10]~adjusted~italic(P),
                title = "DESeq2 results EC",
                subtitle = "Differential expression Volcano Plot",
                legendPosition = "bottom",
                xlim = c(-2,2),
                ylim = c(0,3),
                pCutoff = 0.05)
## Warning: Ignoring unknown parameters: xlim, ylim

#look at res table as a sorted data.frame
datatable(as.data.frame(res))
#check if any significant EC ERV loci are present in BA9
#c('5867', '3276', '3403', '3537') %in% resSig$ervid
#loci 3537 TRUE